MoneyBallR_Modeling
## Skipping install of 'ggbiplot' from a github remote, the SHA1 (7325e880) has not changed since last install.
## Use `force = TRUE` to force installation
## Loading required package: ggplot2
## Loading required package: plyr
## Loading required package: scales
## Loading required package: grid
#Load in the Data
## -- Attaching packages -------------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v tibble 2.1.1 v purrr 0.3.2
## v tidyr 0.8.3 v dplyr 0.8.0.1
## v readr 1.3.1 v stringr 1.4.0
## v tibble 2.1.1 v forcats 0.4.0
## -- Conflicts ----------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::arrange() masks plyr::arrange()
## x readr::col_factor() masks scales::col_factor()
## x purrr::compact() masks plyr::compact()
## x dplyr::count() masks plyr::count()
## x purrr::discard() masks scales::discard()
## x dplyr::failwith() masks plyr::failwith()
## x dplyr::filter() masks stats::filter()
## x dplyr::id() masks plyr::id()
## x dplyr::lag() masks stats::lag()
## x dplyr::mutate() masks plyr::mutate()
## x dplyr::rename() masks plyr::rename()
## x dplyr::summarise() masks plyr::summarise()
## x dplyr::summarize() masks plyr::summarize()
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
## corrplot 0.84 loaded
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
##
## Attaching package: 'DMwR'
## The following object is masked from 'package:plyr':
##
## join
# Load the Moneyball training data; INDEX is a row identifier, not a predictor
df_train <- read.csv("https://raw.githubusercontent.com/crarnouts/Data_621/master/moneyball-training-data.csv")
df_train$INDEX <- NULL
# Evaluation (test) data, same schema minus the response
df_test <- read.csv("https://raw.githubusercontent.com/crarnouts/Data_621/master/moneyball-evaluation-data.csv")
df_test$INDEX <- NULL
# Helper script with random-forest null-imputation utilities
source("https://raw.githubusercontent.com/crarnouts/Data_605_Final/master/RandomForestNulls_testing.R")
## Version 5.2.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
##
## Attaching package: 'modeltools'
## The following object is masked from 'package:plyr':
##
## empty
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
##
## Attaching package: 'strucchange'
## The following object is masked from 'package:stringr':
##
## boundary
## Loading required package: libcoin
##
## Attaching package: 'partykit'
## The following objects are masked from 'package:party':
##
## cforest, ctree, ctree_control, edge_simple, mob, mob_control,
## node_barplot, node_bivplot, node_boxplot, node_inner,
## node_surv, node_terminal, varimp
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:rattle':
##
## importance
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
##
## Attaching package: 'memisc'
## The following objects are masked from 'package:modeltools':
##
## Lapply, relabel
## The following objects are masked from 'package:dplyr':
##
## collect, recode, rename, syms
## The following object is masked from 'package:purrr':
##
## %@%
## The following object is masked from 'package:scales':
##
## percent
## The following object is masked from 'package:plyr':
##
## rename
## The following object is masked from 'package:ggplot2':
##
## syms
## The following objects are masked from 'package:stats':
##
## contr.sum, contr.treatment, contrasts
## The following object is masked from 'package:base':
##
## as.array
##
## Attaching package: 'plotly'
## The following objects are masked from 'package:memisc':
##
## rename, style
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:plyr':
##
## arrange, mutate, rename, summarise
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Drop the redundant "TEAM_" prefix from every column name in both frames
colnames(df_train) <- sub("TEAM_", "", colnames(df_train))
colnames(df_test) <- sub("TEAM_", "", colnames(df_test))

# Missing Variables: per-column NA counts and their share of all rows,
# rendered as a styled kable table, largest counts first
df_train %>%
  gather(variable, value) %>%
  filter(is.na(value)) %>%
  group_by(variable) %>%
  tally() %>%
  mutate(percent = n / nrow(df_train) * 100) %>%
  # one decimal place for shares under 10%, whole percents otherwise
  mutate(percent = paste0(round(percent, ifelse(percent < 10, 1, 0)), "%")) %>%
  arrange(desc(n)) %>%
  rename(`Variable Missing Data` = variable,
         `Number of Records` = n,
         `Share of Total` = percent) %>%
  kable() %>%
  kable_styling()
|---|---|---|
| BATTING_HBP | 2085 | 92% |
| BASERUN_CS | 772 | 34% |
| FIELDING_DP | 286 | 13% |
| BASERUN_SB | 131 | 5.8% |
| BATTING_SO | 102 | 4.5% |
| PITCHING_SO | 102 | 4.5% |
See if we can impute some missing values Using a Random Forest
# TARGET_WINS of 0 is treated as a data error; replace with the mean win total
df_train[which(df_train$TARGET_WINS == 0), "TARGET_WINS"] <- round(mean(df_train$TARGET_WINS), 0)

# Change 0's to NA so they too can be imputed
df_train <- df_train %>%
  mutate(BATTING_SO = ifelse(BATTING_SO == 0, NA, BATTING_SO))

# Flag implausible strikeout totals (> 5346) as missing BEFORE running kNN.
# (Imputing first and flagging after would just copy the outlier values
# straight back from the imputed frame, making the cap a no-op.)
df_train[which(df_train$PITCHING_SO > 5346), "PITCHING_SO"] <- NA

# One kNN pass over the cleaned frame supplies values for every NA below
knn <- df_train %>% knnImputation()

# Copy imputed values back column by column, touching only the NA cells
for (col in c("BATTING_SO", "BASERUN_SB", "BASERUN_CS",
              "BATTING_HBP", "PITCHING_SO", "FIELDING_DP")) {
  impute_me <- is.na(df_train[[col]])
  df_train[impute_me, col] <- knn[impute_me, col]
}
# Change 0's to NA so they too can be imputed
df_test <- df_test %>%
  mutate(BATTING_SO = ifelse(BATTING_SO == 0, NA, BATTING_SO))

# Flag implausible strikeout totals (> 5346) as missing BEFORE running kNN;
# imputing first would simply hand the outlier values straight back
df_test[which(df_test$PITCHING_SO > 5346), "PITCHING_SO"] <- NA

# Single kNN pass supplies replacement values for every NA below
knn <- df_test %>% knnImputation()

# Write imputed values back, touching only the cells that were NA
for (col in c("BATTING_SO", "BASERUN_SB", "BASERUN_CS",
              "BATTING_HBP", "PITCHING_SO", "FIELDING_DP")) {
  impute_me <- is.na(df_test[[col]])
  df_test[impute_me, col] <- knn[impute_me, col]
}
# Correlation Matrix
# Sanity check: remaining NAs per column (should all be 0 after imputation).
# colSums(is.na(.)) replaces the original sum(length(which(is.na(y)))) chain.
na_count <- data.frame(na_count = colSums(is.na(df_train)))
library(corrplot)
# Pairwise correlations between all (numeric) columns
M <- cor(df_train)
corrplot(M, method = "circle")  # plot matrix
##
## Attaching package: 'xts'
## The following objects are masked from 'package:data.table':
##
## first, last
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
##
## legend
# Feature Creation and Data Cleaning
# Singles are not recorded directly: derive 1B = H - 2B - 3B - HR
df_train$BATTING_1B <- with(df_train, BATTING_H - BATTING_2B - BATTING_3B - BATTING_HR)
df_test$BATTING_1B <- with(df_test, BATTING_H - BATTING_2B - BATTING_3B - BATTING_HR)
# Look at Linear Model with all First Order Predictors
# 70/30 split of the training data, stratified on the response
train.index1 <- createDataPartition(df_train$TARGET_WINS, p = .7, list = FALSE)
train_data <- df_train[train.index1, ]
hold_out_data <- df_train[-train.index1, ]

# Fit on the 70% partition, score the held-out 30%
model <- lm(TARGET_WINS ~ ., data = train_data)
hold_out_data$Prediction <- predict(model, hold_out_data)

# Out-of-sample correlation between predicted and actual wins, as a string
oos_cor <- paste(round(cor(hold_out_data$Prediction, hold_out_data$TARGET_WINS), digits = 2))
summary(model)
## Call:
## lm(formula = TARGET_WINS ~ ., data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -45.832 -8.562 -0.082 8.595 68.412
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.4283905 7.5603316 -0.189 0.850170
## BATTING_H 0.0498665 0.0043376 11.496 < 2e-16 ***
## BATTING_2B -0.0203716 0.0111070 -1.834 0.066823 .
## BATTING_3B 0.0734573 0.0209907 3.500 0.000479 ***
## BATTING_HR 0.0432008 0.0337196 1.281 0.200320
## BATTING_BB 0.0181073 0.0071850 2.520 0.011828 *
## BATTING_SO 0.0004028 0.0026430 0.152 0.878876
## BASERUN_SB 0.0248181 0.0052309 4.744 2.28e-06 ***
## BASERUN_CS -0.0125798 0.0183880 -0.684 0.493991
## BATTING_HBP 0.2719421 0.0734697 3.701 0.000222 ***
## PITCHING_H 0.0003889 0.0004152 0.937 0.349036
## PITCHING_HR -0.0084095 0.0294120 -0.286 0.774977
## PITCHING_BB -0.0015055 0.0051259 -0.294 0.769016
## PITCHING_SO 0.0021441 0.0014467 1.482 0.138504
## FIELDING_E -0.0179990 0.0029202 -6.164 9.00e-10 ***
## FIELDING_DP -0.1206721 0.0153576 -7.857 7.19e-15 ***
## BATTING_1B NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.37 on 1579 degrees of freedom
## Multiple R-squared: 0.2956, Adjusted R-squared: 0.2889
## F-statistic: 44.17 on 15 and 1579 DF, p-value: < 2.2e-16
# Hold-out predictions vs actual wins with a fitted linear trend line
first_order_linear <- ggplot(data = hold_out_data, aes(x = Prediction, y = TARGET_WINS)) +
  geom_point(color = 'blue') +
  geom_smooth(method = "lm", se = FALSE) +
  ggtitle(paste("First Order Linear Model with Out of Sample Correlation of: ", oos_cor))
# Look at some Decision Trees for Variable Interactions
## Look at Variance Inflation Factors
# VIF for a column = 1 / (1 - adj.R^2) from regressing it on all other columns.
# Inf values reveal perfect collinearity (e.g. BATTING_1B is derived from
# BATTING_H, BATTING_2B, BATTING_3B and BATTING_HR, so each is fully
# determined by the rest).
df_train1 <- df_train
# Columns previously considered for removal during collinearity screening:
# df_train1$BATTING_H <- NULL
# df_train1$BATTING_SO <- NULL
# df_train1$HR_RATIO <- NULL
# df_train1$BASES_ACQUIRED <- NULL
# df_train1$Hitting_Ratio <- NULL
# df_train1$PITCHING_SO <- NULL
for (i in 3:ncol(df_train1)) {
  # Build "<column> ~ ." as a real formula object; the original passed a
  # character string (noquote() was a no-op), which lm() only coerces silently
  fmla <- as.formula(paste(colnames(df_train1)[i], "~ ."))
  model <- lm(fmla, data = df_train1)
  r_squared <- summary(model)$adj.r.squared
  VIF <- 1 / (1 - r_squared)
  print(colnames(df_train1)[i])
  print(VIF)
  # print(summary(model))
}
## [1] Inf
## [1] "BATTING_3B"
## [1] Inf
## [1] "BATTING_HR"
## [1] Inf
## [1] "BATTING_BB"
## [1] 6.774763
## [1] "BATTING_SO"
## [1] 3.228496
## [1] "BASERUN_SB"
## [1] 1.87324
## [1] "BASERUN_CS"
## [1] 1.171899
## [1] "BATTING_HBP"
## [1] 1.237874
## [1] "PITCHING_H"
## [1] 3.532532
## [1] "PITCHING_HR"
## [1] 29.0385
## [1] "PITCHING_BB"
## [1] 6.31173
## [1] "PITCHING_SO"
## [1] 2.93079
## [1] "FIELDING_E"
## [1] 4.304486
## [1] "FIELDING_DP"
## [1] 1.374294
## Warning in summary.lm(model): essentially perfect fit: summary may be
## unreliable
## [1] "BATTING_1B"
## [1] Inf
# Stepwise Regression
# Fresh 70/30 split for the polynomial candidate model
train.index1 <- createDataPartition(df_train$TARGET_WINS, p = .7, list = FALSE)
train_data <- df_train[train.index1, ]
hold_out_data <- df_train[-train.index1, ]

# Full candidate model: every predictor plus its 2nd-, 3rd- and 4th-degree
# polynomial terms; backward AIC elimination prunes it below
full_formula <- "TARGET_WINS ~ BATTING_2B + BATTING_3B + BATTING_HR + BATTING_BB + BATTING_SO + BASERUN_SB + BASERUN_CS + PITCHING_H + PITCHING_HR + PITCHING_BB + PITCHING_SO + FIELDING_E + FIELDING_DP + BATTING_1B + I(BATTING_2B^2) + I(BATTING_3B^2) + I(BATTING_HR^2) + I(BATTING_BB^2) + I(BATTING_SO^2) + I(BASERUN_SB^2) + I(BASERUN_CS^2) + I(PITCHING_H^2) + I(PITCHING_HR^2) + I(PITCHING_BB^2) + I(PITCHING_SO^2) + I(FIELDING_E^2) + I(FIELDING_DP^2) + I(BATTING_1B^2) + I(BATTING_2B^3) + I(BATTING_3B^3) + I(BATTING_HR^3) + I(BATTING_BB^3) + I(BATTING_SO^3) + I(BASERUN_SB^3) + I(BASERUN_CS^3) + I(PITCHING_H^3) + I(PITCHING_HR^3) + I(PITCHING_BB^3) + I(PITCHING_SO^3) + I(FIELDING_E^3) + I(FIELDING_DP^3) + I(BATTING_1B^3) + I(BATTING_2B^4) + I(BATTING_3B^4) + I(BATTING_HR^4) + I(BATTING_BB^4) + I(BATTING_SO^4) + I(BASERUN_SB^4) + I(BASERUN_CS^4) + I(PITCHING_H^4) + I(PITCHING_HR^4) + I(PITCHING_BB^4) + I(PITCHING_SO^4) + I(FIELDING_E^4) + I(FIELDING_DP^4) + I(BATTING_1B^4)"
full_model <- lm(full_formula, train_data)

# Backward elimination by AIC (spell out FALSE rather than the reassignable F)
step_back_model <- stepAIC(full_model, direction = "backward", trace = FALSE)

# Refit the selected formula directly; formula() on the fitted model is more
# robust than digging the formula out of summary(...)$call[2]
final_model <- lm(formula(step_back_model), train_data)
summary(final_model)
## Call:
## lm(formula = poly.call[2], data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -42.830 -7.556 0.164 7.283 57.160
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.415e+01 1.868e+01 -2.900 0.00379 **
## BATTING_2B 8.417e-01 1.423e-01 5.915 4.06e-09 ***
## BATTING_3B -3.134e-01 2.191e-01 -1.430 0.15278
## PITCHING_H 2.497e-02 4.318e-03 5.784 8.80e-09 ***
## PITCHING_HR -1.370e-01 8.782e-02 -1.560 0.11893
## PITCHING_BB 3.619e-01 5.721e-02 6.326 3.27e-10 ***
## PITCHING_SO -1.676e-02 3.168e-03 -5.289 1.40e-07 ***
## FIELDING_E -2.072e-01 2.623e-02 -7.900 5.23e-15 ***
## I(BATTING_2B^2) -2.457e-03 4.135e-04 -5.942 3.46e-09 ***
## I(BATTING_3B^2) 1.146e-02 4.281e-03 2.678 0.00748 **
## I(BATTING_BB^2) -7.817e-04 1.353e-04 -5.778 9.14e-09 ***
## I(BATTING_SO^2) 6.141e-05 7.866e-06 7.807 1.07e-14 ***
## I(BASERUN_SB^2) 4.044e-04 8.483e-05 4.767 2.04e-06 ***
## I(BASERUN_CS^2) -1.102e-03 3.738e-04 -2.948 0.00325 **
## I(PITCHING_H^2) -5.094e-06 6.362e-07 -8.007 2.28e-15 ***
## I(PITCHING_HR^2) 2.366e-03 1.142e-03 2.072 0.03842 *
## I(PITCHING_BB^2) -5.812e-04 9.382e-05 -6.195 7.46e-10 ***
## I(FIELDING_E^2) 3.527e-04 6.322e-05 5.579 2.85e-08 ***
## I(BATTING_1B^2) 1.726e-05 2.087e-06 8.270 2.84e-16 ***
## I(BATTING_3B^3) -9.213e-05 3.217e-05 -2.864 0.00424 **
## I(BATTING_BB^3) 1.749e-06 2.994e-07 5.842 6.28e-09 ***
## I(BATTING_SO^3) -4.232e-08 5.704e-09 -7.420 1.92e-13 ***
## I(BASERUN_SB^3) -1.089e-06 3.483e-07 -3.128 0.00180 **
## I(BASERUN_CS^3) 5.896e-06 2.157e-06 2.733 0.00634 **
## I(PITCHING_H^3) 3.186e-10 3.902e-11 8.164 6.65e-16 ***
## I(PITCHING_HR^3) -1.006e-05 5.630e-06 -1.787 0.07410 .
## I(PITCHING_BB^3) 3.517e-07 6.088e-08 5.776 9.23e-09 ***
## I(PITCHING_SO^3) 3.523e-10 2.124e-10 1.658 0.09748 .
## I(FIELDING_E^3) -2.685e-07 5.688e-08 -4.721 2.56e-06 ***
## I(FIELDING_DP^3) -8.793e-06 1.851e-06 -4.751 2.21e-06 ***
## I(BATTING_2B^4) 5.703e-09 9.815e-10 5.811 7.53e-09 ***
## I(BATTING_3B^4) 2.232e-07 7.876e-08 2.833 0.00467 **
## I(BATTING_BB^4) -1.014e-09 1.899e-10 -5.341 1.06e-07 ***
## I(BASERUN_SB^4) 8.927e-10 3.627e-10 2.461 0.01395 *
## I(PITCHING_H^4) -5.817e-15 7.401e-16 -7.861 7.09e-15 ***
## I(PITCHING_HR^4) 1.373e-08 9.154e-09 1.500 0.13389
## I(PITCHING_BB^4) -6.801e-11 1.307e-11 -5.205 2.20e-07 ***
## I(FIELDING_E^4) 6.626e-11 1.694e-11 3.911 9.58e-05 ***
## I(FIELDING_DP^4) 3.539e-08 8.693e-09 4.072 4.90e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.75 on 1556 degrees of freedom
## Multiple R-squared: 0.4613, Adjusted R-squared: 0.4482
## F-statistic: 35.07 on 38 and 1556 DF, p-value: < 2.2e-16
# Model Results on the Hold Out Dataset
hold_out_data$Prediction <- predict(final_model, hold_out_data)

# Discard predictions outside (0, 182) — bounds chosen by the author as the
# plausible range of season win totals; TODO confirm against season length
hold_out_data <- hold_out_data %>%
  filter(Prediction < 182) %>%
  filter(Prediction > 0)

# Out-of-sample correlation, printed and kept as a string for the title
cor(hold_out_data$Prediction, hold_out_data$TARGET_WINS)
oos_cor <- paste(round(cor(hold_out_data$Prediction, hold_out_data$TARGET_WINS), digits = 2))

# Interactive scatter of predicted vs actual wins with a linear trend line
ggplotly(
  ggplot(hold_out_data, aes(x = Prediction, y = TARGET_WINS, colour = TARGET_WINS)) +
    geom_point(shape = 1) +  # use hollow circles
    scale_color_gradient2(low = "blue",
                          high = "red", space = "Lab") +
    geom_smooth(method = "lm", se = FALSE, colour = "orange") +
    ggtitle(paste("Polynomial Stepwise Model with Out of Sample Correlation of: ", oos_cor))
)